import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np
%matplotlib inline
# Load the raw property listings from the Excel workbook and preview the
# first three rows.
df_unencoded = pd.read_excel("data.xlsx")
df_unencoded.head(3)
| | size_m2 | bathrooms | meters_from_metro | has_garage | property_type | price |
|---|---|---|---|---|---|---|
| 0 | 72 | 3 | 411 | yes | detached | 358356 |
| 1 | 57 | 1 | 46 | no | detached | 310489 |
| 2 | 67 | 2 | 1281 | no | terraced | 156408 |
# Print each column's dtype and non-null count to reveal missing values.
df_unencoded.info()
# Discard every row that has a missing value in any column.
df_unencoded = df_unencoded.dropna(how='any')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1998 entries, 0 to 1997
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   size_m2            1998 non-null   int64
 1   bathrooms          1998 non-null   int64
 2   meters_from_metro  1998 non-null   int64
 3   has_garage         1997 non-null   object
 4   property_type      1998 non-null   object
 5   price              1998 non-null   int64
dtypes: int64(4), object(2)
memory usage: 93.8+ KB
# Build histogram bin edges so that every distinct bathroom count gets its own
# bar: interior edges sit halfway between neighbouring values, and the outer
# edges extend 0.5 below the smallest and 0.5 above the largest value.
unique_values = np.sort(df_unencoded['bathrooms'].unique())
# Pairwise midpoints are taken by slicing rather than
# np.convolve(..., 'valid'): with a single distinct value convolve swaps its
# operands and returns two spurious interior edges, whereas slicing correctly
# yields none.
bin_edges = (unique_values[:-1] + unique_values[1:]) / 2
bin_edges = np.concatenate(([unique_values.min() - 0.5], bin_edges, [unique_values.max() + 0.5]))
histogram = sns.histplot(data=df_unencoded, x="bathrooms", bins=bin_edges)
# Distribution of floor area, using seaborn's automatic binning.
# NOTE(review): if this runs in the same cell as the histogram above, both are
# drawn onto the same axes — confirm they live in separate cells.
histogram = sns.histplot(data=df_unencoded, x="size_m2")
# Price against floor area, with points coloured by property type.
sns.scatterplot(data=df_unencoded, x="size_m2", y="price", hue="property_type")
<AxesSubplot:xlabel='size_m2', ylabel='price'>
# Price against distance to the metro, with points coloured by property type.
sns.scatterplot(data=df_unencoded, x="meters_from_metro", y="price", hue="property_type")
<AxesSubplot:xlabel='meters_from_metro', ylabel='price'>
# Heatmap of the pairwise correlations between the numeric columns.
# The numeric/bool columns are selected explicitly because has_garage and
# property_type still hold strings here: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, while older versions silently
# dropped them.
sns.heatmap(
    df_unencoded.select_dtypes(include=['number', 'bool']).corr(),
    cmap='Reds',
    annot=True,
)
# The trailing semicolon suppresses the notebook's text echo of the title object.
plt.title('Correlation Matrix');
def encode(df):
    """Return a numerically encoded copy of *df* for model fitting/prediction.

    ``has_garage`` is converted from ``'yes'``/``'no'`` to ``1``/``0`` (any
    other value is left untouched) and ``property_type`` is one-hot encoded.
    The original ``property_type`` column and the ``terraced`` indicator are
    then dropped, which makes terraced the baseline category.

    The frame passed in is not modified.

    Raises:
        KeyError: if *df* lacks a ``has_garage`` or ``property_type`` column,
            or if no row has property type ``terraced`` (so there is no
            ``terraced`` indicator column to drop).
    """
    # Work on a copy so the caller's frame keeps its original yes/no labels.
    df = df.copy()
    df['has_garage'] = df['has_garage'].replace({'yes': 1, 'no': 0})
    # One binary indicator column per distinct property type.
    dummies = pd.get_dummies(df['property_type'])
    df = pd.concat([df, dummies], axis='columns')
    # Drop one indicator: the full set always sums to 1, which would be
    # perfectly collinear with the regression intercept.
    return df.drop(['property_type', 'terraced'], axis=1)
# Encode the categorical columns of the training data and display the result.
df_encoded = encode(df_unencoded)
df_encoded
| | size_m2 | bathrooms | meters_from_metro | has_garage | price | detached | semi-detached |
|---|---|---|---|---|---|---|---|
| 0 | 72 | 3 | 411 | 1 | 358356 | 1 | 0 |
| 1 | 57 | 1 | 46 | 0 | 310489 | 1 | 0 |
| 2 | 67 | 2 | 1281 | 0 | 156408 | 0 | 0 |
| 3 | 58 | 1 | 928 | 1 | 294774 | 1 | 0 |
| 4 | 112 | 1 | 73 | 0 | 468561 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1993 | 80 | 2 | 999 | 1 | 249165 | 0 | 1 |
| 1994 | 93 | 1 | 834 | 0 | 386007 | 0 | 0 |
| 1995 | 70 | 2 | 293 | 0 | 415181 | 1 | 0 |
| 1996 | 56 | 2 | 687 | 1 | 215375 | 0 | 1 |
| 1997 | 69 | 1 | 522 | 0 | 241538 | 0 | 0 |
1997 rows × 7 columns
# Feature matrix (every encoded predictor) and target vector (price).
X = df_encoded[[
    'size_m2',
    'bathrooms',
    'meters_from_metro',
    'has_garage',
    'detached',
    'semi-detached',
]]
y = df_encoded['price']

model = LinearRegression()

# Score the estimator with 5-fold cross-validation before the final fit.
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Fit on all rows and report the learned parameters.
model.fit(X, y)
print(f"The coefficients are: {model.coef_}")
print(f"The intercept/constant is: {model.intercept_}")
Cross-validation scores: [0.86022103 0.85712256 0.85362275 0.84165308 0.82590028]
Mean cross-validation score: 0.8477039394082991
The coefficients are: [ 3470.26184982 21463.6107701 -131.99294886 5652.71566282 109705.20322427 23904.38558323]
The intercept/constant is: 51818.0074061433
# Load the listings to price, encode them with the same function used for the
# training data, and preview the encoded frame.
df_new_unencoded = pd.read_csv('data_to_predict_on.csv')
df_new_encoded = encode(df_new_unencoded)
df_new_encoded.head()
| | size_m2 | bathrooms | meters_from_metro | has_garage | detached | semi-detached |
|---|---|---|---|---|---|---|
| 0 | 100 | 3 | 207 | 1 | 0 | 1 |
| 1 | 121 | 3 | 356 | 0 | 1 | 0 |
| 2 | 90 | 2 | 201 | 0 | 1 | 0 |
| 3 | 59 | 1 | 692 | 0 | 0 | 0 |
| 4 | 61 | 1 | 482 | 1 | 0 | 0 |
# Predict prices for the new listings. The feature columns are selected by
# name, in the order used for training, so the result does not depend on the
# CSV's column layout: extra columns are ignored and a missing feature raises
# a KeyError instead of producing misaligned predictions.
prediction = model.predict(df_new_encoded[X.columns])
# Attach the predictions, rounded to two decimals, to the original listings.
df_new_unencoded['predicted_price'] = prediction.round(2)
df_new_unencoded.head()
| | size_m2 | bathrooms | meters_from_metro | has_garage | property_type | predicted_price |
|---|---|---|---|---|---|---|
| 0 | 100 | 3 | 207 | 1 | semi-detached | 465469.59 |
| 1 | 121 | 3 | 356 | 0 | detached | 598826.24 |
| 2 | 90 | 2 | 201 | 0 | detached | 490243.42 |
| 3 | 59 | 1 | 692 | 0 | terraced | 186687.95 |
| 4 | 61 | 1 | 482 | 1 | terraced | 226999.71 |